{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6789eab6-9237-41f7-8323-9f0cd0d6e3d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============GPU================\n",
      "Thu May  9 01:17:34 2024       \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |\n",
      "|-----------------------------------------+----------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                      |               MIG M. |\n",
      "|=========================================+======================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3070        On  | 00000000:C1:00.0 Off |                  N/A |\n",
      "| 85%   32C    P8              27W / 220W |      2MiB /  8192MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "                                                                                         \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                            |\n",
      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
      "|        ID   ID                                                             Usage      |\n",
      "|=======================================================================================|\n",
      "|  No running processes found                                                           |\n",
      "+---------------------------------------------------------------------------------------+\n",
      "============CUDA version================\n",
      "nvcc: NVIDIA (R) Cuda compiler driver\n",
      "Copyright (c) 2005-2023 NVIDIA Corporation\n",
      "Built on Mon_Apr__3_17:16:06_PDT_2023\n",
      "Cuda compilation tools, release 12.1, V12.1.105\n",
      "Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "============CPU================\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "model name\t: AMD EPYC 7443P 24-Core Processor\n",
      "============Memory================\n",
      "MemTotal:       263774780 kB\n"
     ]
    }
   ],
   "source": [
    "# GPU\n",
    "print(\"============GPU================\")\n",
    "!nvidia-smi\n",
    "\n",
    "# CUDA version\n",
    "print(\"============CUDA version================\")\n",
    "!nvcc --version\n",
    "\n",
    "# CPU\n",
    "print(\"============CPU================\")\n",
    "!cat /proc/cpuinfo | grep model\\ name\n",
    "\n",
    "# Memory\n",
    "print(\"============Memory================\")\n",
    "!cat /proc/meminfo | grep MemTotal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b9583f02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'llama.cpp'...\n",
      "remote: Enumerating objects: 24130, done.\u001b[K\n",
      "remote: Counting objects: 100% (9002/9002), done.\u001b[K\n",
      "remote: Compressing objects: 100% (534/534), done.\u001b[K\n",
      "remote: Total 24130 (delta 8724), reused 8526 (delta 8467), pack-reused 15128\u001b[K\n",
      "Receiving objects: 100% (24130/24130), 45.90 MiB | 27.03 MiB/s, done.\n",
      "Resolving deltas: 100% (17143/17143), done.\n",
      "/workspace/llama.cpp\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
      "  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
     ]
    }
   ],
   "source": [
    "# Get the code for the first time use\n",
    "!git clone https://github.com/ggerganov/llama.cpp\n",
    "%cd llama.cpp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4f651103-a6c8-4b7d-9f0f-be0553b22acf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml-vocab-aquila.gguf\t\t    ggml-vocab-llama-bpe.gguf\n",
      "ggml-vocab-baichuan.gguf\t    ggml-vocab-llama-bpe.gguf.inp\n",
      "ggml-vocab-bert-bge.gguf\t    ggml-vocab-llama-bpe.gguf.out\n",
      "ggml-vocab-bert-bge.gguf.inp\t    ggml-vocab-llama-spm.gguf\n",
      "ggml-vocab-bert-bge.gguf.out\t    ggml-vocab-llama-spm.gguf.inp\n",
      "ggml-vocab-command-r.gguf\t    ggml-vocab-llama-spm.gguf.out\n",
      "ggml-vocab-command-r.gguf.inp\t    ggml-vocab-mpt.gguf\n",
      "ggml-vocab-command-r.gguf.out\t    ggml-vocab-mpt.gguf.inp\n",
      "ggml-vocab-deepseek-coder.gguf\t    ggml-vocab-mpt.gguf.out\n",
      "ggml-vocab-deepseek-coder.gguf.inp  ggml-vocab-phi-3.gguf\n",
      "ggml-vocab-deepseek-coder.gguf.out  ggml-vocab-phi-3.gguf.inp\n",
      "ggml-vocab-deepseek-llm.gguf\t    ggml-vocab-phi-3.gguf.out\n",
      "ggml-vocab-deepseek-llm.gguf.inp    ggml-vocab-qwen2.gguf\n",
      "ggml-vocab-deepseek-llm.gguf.out    ggml-vocab-qwen2.gguf.inp\n",
      "ggml-vocab-falcon.gguf\t\t    ggml-vocab-qwen2.gguf.out\n",
      "ggml-vocab-falcon.gguf.inp\t    ggml-vocab-refact.gguf\n",
      "ggml-vocab-falcon.gguf.out\t    ggml-vocab-refact.gguf.inp\n",
      "ggml-vocab-gpt-2.gguf\t\t    ggml-vocab-refact.gguf.out\n",
      "ggml-vocab-gpt-2.gguf.inp\t    ggml-vocab-stablelm.gguf\n",
      "ggml-vocab-gpt-2.gguf.out\t    ggml-vocab-starcoder.gguf\n",
      "ggml-vocab-gpt-neox.gguf\t    ggml-vocab-starcoder.gguf.inp\n",
      "ggml-vocab-gpt2.gguf\t\t    ggml-vocab-starcoder.gguf.out\n"
     ]
    }
   ],
   "source": [
    "# Obtain the original LLaMA model weights and place them in ./models\n",
    "!ls ./models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "58f434b3-1493-448d-95d3-8954c1aa1267",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]\n",
      "Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]      \u001b[0m\u001b[33m\n",
      "Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]\n",
      "Get:4 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                \u001b[0m\u001b[33m\n",
      "Get:5 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1789 kB][33m\u001b[33m\n",
      "Get:6 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [44.7 kB]\n",
      "Get:7 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1078 kB]\n",
      "Get:8 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2308 kB]33m\n",
      "Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]3m\u001b[33m\u001b[33m\n",
      "Get:10 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]\n",
      "Get:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [27.8 kB]\n",
      "Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]\n",
      "Get:13 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]\n",
      "Get:14 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]\n",
      "Get:15 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 Packages [266 kB]\n",
      "Get:16 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]\n",
      "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [2382 kB]\n",
      "Get:18 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2069 kB]\n",
      "Get:19 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [51.1 kB]\n",
      "Get:20 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1373 kB]33m\n",
      "Get:21 http://archive.ubuntu.com/ubuntu jammy-backports/universe amd64 Packages [31.9 kB]\n",
      "Get:22 http://archive.ubuntu.com/ubuntu jammy-backports/main amd64 Packages [81.0 kB]\n",
      "Fetched 32.4 MB in 3s (9642 kB/s)[33m0m\u001b[33m\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "56 packages can be upgraded. Run 'apt list --upgradable' to see them.\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "The following NEW packages will be installed:\n",
      "  git-lfs\n",
      "0 upgraded, 1 newly installed, 0 to remove and 56 not upgraded.\n",
      "Need to get 3503 kB of archives.\n",
      "After this operation, 10.4 MB of additional disk space will be used.\n",
      "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 git-lfs amd64 3.0.2-1ubuntu0.2 [3503 kB]\n",
      "Fetched 3503 kB in 1s (3258 kB/s)[0mm\u001b[33m\u001b[33m\n",
      "debconf: delaying package configuration, since apt-utils is not installed\n",
      "\n",
      "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
      "(Reading database ... 21038 files and directories currently installed.)\n",
      "Preparing to unpack .../git-lfs_3.0.2-1ubuntu0.2_amd64.deb ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [  0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8\n",
      "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
     ]
    }
   ],
   "source": [
    "# Get git-lfs to clone large files\n",
    "!apt update\n",
    "!apt install git-lfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4e63016a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp/models\n",
      "Updated git hooks.\n",
      "Git LFS initialized.\n",
      "Cloning into 'Llama-3-8B-GGUF'...\n",
      "remote: Enumerating objects: 44, done.\u001b[K\n",
      "remote: Counting objects: 100% (41/41), done.\u001b[K\n",
      "remote: Compressing objects: 100% (41/41), done.\u001b[K\n",
      "remote: Total 44 (delta 9), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
      "Unpacking objects: 100% (44/44), 2.25 MiB | 3.41 MiB/s, done.\n",
      "Filtering content: 100% (8/8), 9.46 GiB | 18.40 MiB/s, done.\n",
      "Encountered 6 file(s) that may not have been copied correctly on Windows:\n",
      "\tggml-model-Q4_K_M.gguf\n",
      "\tMeta-Llama-3-8B/model-00003-of-00004.safetensors\n",
      "\tMeta-Llama-3-8B/model-00001-of-00004.safetensors\n",
      "\tMeta-Llama-3-8B/model-00002-of-00004.safetensors\n",
      "\tMeta-Llama-3-8B/original/consolidated.00.pth\n",
      "\tggml-model-f16.gguf\n",
      "\n",
      "See: `git lfs help smudge` for more details.\n"
     ]
    }
   ],
   "source": [
    "# Clone/Download the model files from Meta HF repo: https://huggingface.co/meta-llama. Or feel free to clone from my HF repo:\n",
    "%cd models\n",
    "!git lfs install\n",
    "!git clone https://huggingface.co/JaaackXD/Llama-3-8B-GGUF\n",
    "# !git clone https://huggingface.co/JaaackXD/Llama-3-70B-GGUF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "603a64eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp\n"
     ]
    }
   ],
   "source": [
    "!mkdir 8B-v3\n",
    "!mv Llama-3-8B-GGUF/*.gguf 8B-v3\n",
    "# !mkdir 70B-v3\n",
    "# !mv Llama-3-70B-GGUF/*.gguf 70B-v3\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "43e9d80e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE \n",
      "I NVCCFLAGS: -std=c++11 -O3 \n",
      "I LDFLAGS:    \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "\n",
      "rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o tests/test-autorelease tests/test-backend-ops tests/test-double-float tests/test-grad0 tests/test-grammar-integration tests/test-grammar-parser tests/test-json-schema-to-grammar tests/test-llama-grammar tests/test-model-load-cancel tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-rope tests/test-sampling tests/test-tokenizer-0 tests/test-tokenizer-1-bpe tests/test-tokenizer-1-spm\n",
      "rm -vrf ggml-cuda/*.o\n",
      "find examples pocs -type f -name \"*.o\" -delete\n",
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS \n",
      "I NVCCFLAGS: -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \n",
      "I LDFLAGS:   -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I NVCC:      Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "\n",
      "!!!!\n",
      "LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.\n",
      "!!!!\n",
      "\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml.c -o ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c llama.cpp -o llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/common.cpp -o common.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/sampling.cpp -o sampling.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/grammar-parser.cpp -o grammar-parser.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/json-schema-to-grammar.cpp -o json-schema-to-grammar.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/console.cpp -o console.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c sgemm.cpp -o sgemm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda.cu -o ggml-cuda.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/acc.cu -o ggml-cuda/acc.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/alibi.cu -o ggml-cuda/alibi.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/arange.cu -o ggml-cuda/arange.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/argsort.cu -o ggml-cuda/argsort.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/binbcast.cu -o ggml-cuda/binbcast.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/clamp.cu -o ggml-cuda/clamp.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/concat.cu -o ggml-cuda/concat.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/convert.cu -o ggml-cuda/convert.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/cpy.cu -o ggml-cuda/cpy.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/diagmask.cu -o ggml-cuda/diagmask.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/dmmv.cu -o ggml-cuda/dmmv.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/fattn.cu -o ggml-cuda/fattn.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/getrows.cu -o ggml-cuda/getrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/im2col.cu -o ggml-cuda/im2col.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmq.cu -o ggml-cuda/mmq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmvq.cu -o ggml-cuda/mmvq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/norm.cu -o ggml-cuda/norm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pad.cu -o ggml-cuda/pad.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pool2d.cu -o ggml-cuda/pool2d.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/quantize.cu -o ggml-cuda/quantize.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/rope.cu -o ggml-cuda/rope.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/scale.cu -o ggml-cuda/scale.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/softmax.cu -o ggml-cuda/softmax.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/sumrows.cu -o ggml-cuda/sumrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/tsembd.cu -o ggml-cuda/tsembd.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/unary.cu -o ggml-cuda/unary.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/upscale.cu -o ggml-cuda/upscale.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-alloc.c -o ggml-alloc.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-backend.c -o ggml-backend.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion     -c ggml-quants.c -o ggml-quants.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode.cpp -o unicode.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode-data.cpp -o unicode-data.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/train.cpp -o train.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/ngram-cache.cpp -o ngram-cache.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion  -c tests/test-c.c -o tests/test-c.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/build-info.cpp -o build-info.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/main/main.cpp -o examples/main/main.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize/quantize.cpp -o examples/quantize/quantize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize-stats/quantize-stats.cpp -o examples/quantize-stats/quantize-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/perplexity/perplexity.cpp -o examples/perplexity/perplexity.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/imatrix/imatrix.cpp -o examples/imatrix/imatrix.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/embedding/embedding.cpp -o examples/embedding/embedding.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/vdot.cpp -o pocs/vdot/vdot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/q8dot.cpp -o pocs/vdot/q8dot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/train-text-from-scratch/train-text-from-scratch.cpp -o examples/train-text-from-scratch/train-text-from-scratch.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/simple/simple.cpp -o examples/simple/simple.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched/batched.cpp -o examples/batched/batched.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched-bench/batched-bench.cpp -o examples/batched-bench/batched-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/save-load-state/save-load-state.cpp -o examples/save-load-state/save-load-state.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/server/server.cpp -o examples/server/server.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf/gguf.cpp -o examples/gguf/gguf.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf-split/gguf-split.cpp -o examples/gguf-split/gguf-split.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/eval-callback/eval-callback.cpp -o examples/eval-callback/eval-callback.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llama-bench/llama-bench.cpp -o examples/llama-bench/llama-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -static -fPIC -c examples/llava/llava.cpp -o libllava.a -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava-cli.cpp -o examples/llava/llava-cli.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/baby-llama/baby-llama.cpp -o examples/baby-llama/baby-llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/beam-search/beam-search.cpp -o examples/beam-search/beam-search.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/retrieval/retrieval.cpp -o examples/retrieval/retrieval.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/speculative/speculative.cpp -o examples/speculative/speculative.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/infill/infill.cpp -o examples/infill/infill.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/tokenize/tokenize.cpp -o examples/tokenize/tokenize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/benchmark/benchmark-matmult.cpp -o examples/benchmark/benchmark-matmult.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/parallel/parallel.cpp -o examples/parallel/parallel.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/finetune/finetune.cpp -o examples/finetune/finetune.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/export-lora/export-lora.cpp -o examples/export-lora/export-lora.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookahead/lookahead.cpp -o examples/lookahead/lookahead.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup.cpp -o examples/lookup/lookup.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/passkey/passkey.cpp -o examples/passkey/passkey.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gritlm/gritlm.cpp -o examples/gritlm/gritlm.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf/gguf.o -o gguf -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/baby-llama/baby-llama.o -o baby-llama -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/vdot.o -o vdot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/benchmark/benchmark-matmult.o -o benchmark-matmult -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/eval-callback/eval-callback.o -o eval-callback -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/q8dot.o -o q8dot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/tokenize/tokenize.o -o tokenize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o common.o sampling.o grammar-parser.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched-bench/batched-bench.o -o batched-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/export-lora/export-lora.o -o export-lora -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/save-load-state/save-load-state.o -o save-load-state -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/beam-search/beam-search.o -o beam-search -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize/quantize.o -o quantize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/parallel/parallel.o -o parallel -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gritlm/gritlm.o -o gritlm -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/simple/simple.o -o simple -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/passkey/passkey.o -o passkey -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf-split/gguf-split.o -o gguf-split -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/embedding/embedding.o -o embedding -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched/batched.o -o batched -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookahead/lookahead.o -o lookahead -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup.o -o lookup -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/imatrix/imatrix.o -o imatrix -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/clip.cpp  -o examples/llava/clip.o -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/retrieval/retrieval.o -o retrieval -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/train-text-from-scratch/train-text-from-scratch.o -o train-text-from-scratch -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-create.cpp -o examples/lookup/lookup-create.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o -o convert-llama2c-to-ggml -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/infill/infill.o -o infill -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/speculative/speculative.o -o speculative -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/main/main.o -o main -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/finetune/finetune.o -o finetune -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "\n",
      "====  Run ./main -h for help.  ====\n",
      "\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-create.o -o lookup-create -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-merge.cpp -o examples/lookup/lookup-merge.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/perplexity/perplexity.o -o perplexity -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-merge.o -o lookup-merge -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-stats.cpp -o examples/lookup/lookup-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize-stats/quantize-stats.o -o quantize-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-stats.o -o lookup-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llama-bench/llama-bench.o -o llama-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava.cpp -o examples/llava/llava.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llava/llava-cli.o examples/llava/clip.o examples/llava/llava.o -o llava-cli -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o -Iexamples/server examples/server/server.o -o server -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib  \n"
     ]
    }
   ],
   "source": [
    "# Build\n",
    "!make clean && LLAMA_CUBLAS=1 make -j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3ca0e26e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))\n",
      "  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
      "Collecting sentencepiece~=0.2.0 (from -r ./requirements/requirements-convert.txt (line 2))\n",
      "  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
      "Collecting transformers<5.0.0,>=4.40.1 (from -r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hCollecting gguf>=0.1.0 (from -r ./requirements/requirements-convert.txt (line 4))\n",
      "  Downloading gguf-0.6.0-py3-none-any.whl.metadata (3.2 kB)\n",
      "Collecting protobuf<5.0.0,>=4.21.0 (from -r ./requirements/requirements-convert.txt (line 5))\n",
      "  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)\n",
      "Collecting torch~=2.1.1 (from -r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.13.1)\n",
      "Collecting huggingface-hub<1.0,>=0.19.3 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (23.2)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (6.0.1)\n",
      "Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.31.0)\n",
      "Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
      "Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
      "Collecting tqdm>=4.27 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (4.9.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.12)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.2.1)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.1.3)\n",
      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2024.2.0)\n",
      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (8.9.2.26)\n",
      "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.3.1)\n",
      "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.0.2.54)\n",
      "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (10.3.2.106)\n",
      "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.4.5.107)\n",
      "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.0.106)\n",
      "Collecting nvidia-nccl-cu12==2.18.1 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n",
      "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Collecting triton==2.1.0 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)\n",
      "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.3.101)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2.1.5)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.6)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.2.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2024.2.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.3.0)\n",
      "Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m78.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading transformers-4.40.2-py3-none-any.whl (9.0 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.0/9.0 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hDownloading gguf-0.6.0-py3-none-any.whl (23 kB)\n",
      "Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.6/294.6 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m670.2/670.2 MB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.8/209.8 MB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.2/89.2 MB\u001b[0m \u001b[31m61.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.2/401.2 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.1/774.1 kB\u001b[0m \u001b[31m53.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m59.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m81.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
      "\u001b[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.3/78.3 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: sentencepiece, triton, tqdm, safetensors, regex, protobuf, nvidia-nccl-cu12, numpy, huggingface-hub, gguf, torch, tokenizers, transformers\n",
      "  Attempting uninstall: triton\n",
      "    Found existing installation: triton 2.2.0\n",
      "    Uninstalling triton-2.2.0:\n",
      "      Successfully uninstalled triton-2.2.0\n",
      "  Attempting uninstall: nvidia-nccl-cu12\n",
      "    Found existing installation: nvidia-nccl-cu12 2.19.3\n",
      "    Uninstalling nvidia-nccl-cu12-2.19.3:\n",
      "      Successfully uninstalled nvidia-nccl-cu12-2.19.3\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.26.3\n",
      "    Uninstalling numpy-1.26.3:\n",
      "      Successfully uninstalled numpy-1.26.3\n",
      "  Attempting uninstall: torch\n",
      "    Found existing installation: torch 2.2.0\n",
      "    Uninstalling torch-2.2.0:\n",
      "      Successfully uninstalled torch-2.2.0\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "torchaudio 2.2.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\n",
      "torchvision 0.17.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed gguf-0.6.0 huggingface-hub-0.23.0 numpy-1.24.4 nvidia-nccl-cu12-2.18.1 protobuf-4.25.3 regex-2024.4.28 safetensors-0.4.3 sentencepiece-0.2.0 tokenizers-0.19.1 torch-2.1.2 tqdm-4.66.4 transformers-4.40.2 triton-2.1.0\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "# Run the code if you would like to convert and quantize models by yourself\n",
    "# Install Python dependencies\n",
    "!python3 -m pip install -r requirements.txt\n",
    "\n",
    "# # Convert the HF models to ggml FP16 format (High RAM requirement!)\n",
    "# !python3 convert-hf-to-gguf.py models/Llama-3-8B-GGUF/Meta-Llama-3-8B/ --outfile models/8B-v3/ggml-model-f16.gguf --outtype f16\n",
    "# !python3 convert-hf-to-gguf.py models/Llama-3-70B-GGUF/Meta-Llama-3-70B/ --outfile models/70B-v3/ggml-model-f16.gguf --outtype f16\n",
    "\n",
    "# # Quantize the model to 4-bits (using Q4_K_M method)\n",
    "# !./quantize ./models/8B-v3/ggml-model-f16.gguf ./models/8B-v3/ggml-model-Q4_K_M.gguf Q4_K_M\n",
    "# !./quantize ./models/70B-v3/ggml-model-f16.gguf ./models/70B-v3/ggml-model-Q4_K_M.gguf Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "30466fa3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Log start\n",
      "main: build = 2824 (4426e298)\n",
      "main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n",
      "main: seed  = 0\n",
      "llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ./models/8B-v3/ggml-model-Q4_K_M.gguf (version GGUF V3 (latest))\n",
      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
      "llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B\n",
      "llama_model_loader: - kv   2:                          llama.block_count u32              = 32\n",
      "llama_model_loader: - kv   3:                       llama.context_length u32              = 8192\n",
      "llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096\n",
      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
      "llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32\n",
      "llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8\n",
      "llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000\n",
      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
      "llama_model_loader: - kv  10:                          general.file_type u32              = 15\n",
      "llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256\n",
      "llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128\n",
      "llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2\n",
      "llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe\n",
      "llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
      "llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
      "llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = [\"Ġ Ġ\", \"Ġ ĠĠĠ\", \"ĠĠ ĠĠ\", \"...\n",
      "llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000\n",
      "llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128001\n",
      "llama_model_loader: - kv  20:               general.quantization_version u32              = 2\n",
      "llama_model_loader: - type  f32:   65 tensors\n",
      "llama_model_loader: - type q4_K:  193 tensors\n",
      "llama_model_loader: - type q6_K:   33 tensors\n",
      "llm_load_vocab: special tokens definition check successful ( 256/128256 ).\n",
      "llm_load_print_meta: format           = GGUF V3 (latest)\n",
      "llm_load_print_meta: arch             = llama\n",
      "llm_load_print_meta: vocab type       = BPE\n",
      "llm_load_print_meta: n_vocab          = 128256\n",
      "llm_load_print_meta: n_merges         = 280147\n",
      "llm_load_print_meta: n_ctx_train      = 8192\n",
      "llm_load_print_meta: n_embd           = 4096\n",
      "llm_load_print_meta: n_head           = 32\n",
      "llm_load_print_meta: n_head_kv        = 8\n",
      "llm_load_print_meta: n_layer          = 32\n",
      "llm_load_print_meta: n_rot            = 128\n",
      "llm_load_print_meta: n_embd_head_k    = 128\n",
      "llm_load_print_meta: n_embd_head_v    = 128\n",
      "llm_load_print_meta: n_gqa            = 4\n",
      "llm_load_print_meta: n_embd_k_gqa     = 1024\n",
      "llm_load_print_meta: n_embd_v_gqa     = 1024\n",
      "llm_load_print_meta: f_norm_eps       = 0.0e+00\n",
      "llm_load_print_meta: f_norm_rms_eps   = 1.0e-05\n",
      "llm_load_print_meta: f_clamp_kqv      = 0.0e+00\n",
      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
      "llm_load_print_meta: f_logit_scale    = 0.0e+00\n",
      "llm_load_print_meta: n_ff             = 14336\n",
      "llm_load_print_meta: n_expert         = 0\n",
      "llm_load_print_meta: n_expert_used    = 0\n",
      "llm_load_print_meta: causal attn      = 1\n",
      "llm_load_print_meta: pooling type     = 0\n",
      "llm_load_print_meta: rope type        = 0\n",
      "llm_load_print_meta: rope scaling     = linear\n",
      "llm_load_print_meta: freq_base_train  = 500000.0\n",
      "llm_load_print_meta: freq_scale_train = 1\n",
      "llm_load_print_meta: n_yarn_orig_ctx  = 8192\n",
      "llm_load_print_meta: rope_finetuned   = unknown\n",
      "llm_load_print_meta: ssm_d_conv       = 0\n",
      "llm_load_print_meta: ssm_d_inner      = 0\n",
      "llm_load_print_meta: ssm_d_state      = 0\n",
      "llm_load_print_meta: ssm_dt_rank      = 0\n",
      "llm_load_print_meta: model type       = 8B\n",
      "llm_load_print_meta: model ftype      = Q4_K - Medium\n",
      "llm_load_print_meta: model params     = 8.03 B\n",
      "llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW) \n",
      "llm_load_print_meta: general.name     = Meta-Llama-3-8B\n",
      "llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'\n",
      "llm_load_print_meta: EOS token        = 128001 '<|end_of_text|>'\n",
      "llm_load_print_meta: LF token         = 128 'Ä'\n",
      "llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'\n",
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 3070, compute capability 8.6, VMM: yes\n",
      "llm_load_tensors: ggml ctx size =    0.30 MiB\n",
      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
      "llm_load_tensors: offloading non-repeating layers to GPU\n",
      "llm_load_tensors: offloaded 33/33 layers to GPU\n",
      "llm_load_tensors:        CPU buffer size =   281.81 MiB\n",
      "llm_load_tensors:      CUDA0 buffer size =  4403.49 MiB\n",
      "........................................................................................\n",
      "llama_new_context_with_model: n_ctx      = 8192\n",
      "llama_new_context_with_model: n_batch    = 2048\n",
      "llama_new_context_with_model: n_ubatch   = 512\n",
      "llama_new_context_with_model: flash_attn = 0\n",
      "llama_new_context_with_model: freq_base  = 500000.0\n",
      "llama_new_context_with_model: freq_scale = 1\n",
      "llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB\n",
      "llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB\n",
      "llama_new_context_with_model:  CUDA_Host  output buffer size =     0.49 MiB\n",
      "llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB\n",
      "llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB\n",
      "llama_new_context_with_model: graph nodes  = 1030\n",
      "llama_new_context_with_model: graph splits = 2\n",
      "\n",
      "system_info: n_threads = 24 / 48 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \n",
      "sampling: \n",
      "\trepeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000\n",
      "\ttop_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 1.100\n",
      "\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\n",
      "sampling order: \n",
      "CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature \n",
      "generate: n_ctx = 8192, n_batch = 2048, n_predict = 1024, n_keep = 0\n",
      "\n",
      "\n",
      "\u001b[33m<|begin_of_text|> First Citizen:\n",
      "\n",
      " Before we proceed any further, hear me speak.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Speak, speak.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " You are all resolved rather to die than to famish?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Resolved. resolved.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " First, you know Caius Marcius is chief enemy to the people.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " We know't, we know't.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " Let us kill him, and we'll have corn at our own price. Is't a verdict?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " No more talking on't; let it be done: away, away!\n",
      "\n",
      " \n",
      "\n",
      " Second Citizen:\n",
      "\n",
      " One word, good citizens.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity,  while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of  our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes,  ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\n",
      "\n",
      " \n",
      "\n",
      " \u001b[0m2d Cit:\n",
      "\n",
      " If there were no more but the empty titles madam, then would they undock the vessel of their maiden fortunes, and yet show wisely in their rewindings that our sons may see even out of their ashes by which time they may do it to their hearts' content; if these be men o'four and twenty, let them use apes enough;  if these be men made after his pattern, if the sea e'er drowned them 'tis all one to the girdle of it. No, there is more in this: princes have low'd for their subjects, not subjects for their princes.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " You are i'faith a dozen and sixer too many; go you men twelve; let the people see the morning's work, and then come home to lunch. Come!\n",
      "\n",
      " \n",
      "\n",
      " Third Citizen:\n",
      "\n",
      " What time o'night is't?\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " I'll show you opinions: God forbid! An it should stir obedience, armed with no more than  poor and plain opposites.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " The gods defend us! Remember who they are, I will not fear to show men's wickedness. [Sennet within.] You that shall report my sayings, the instant upon me; put your garters on and alter not a syllable of them.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " Let them be which are about you here.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Ay, ay, by any means!\n",
      "\n",
      " \n",
      "\n",
      " 2d Cit:\n",
      "\n",
      " Strike out, or strike up; words, no action; you do but dally where you should drive the hazard: blindholed, you must play with your  lives.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " Pray all be merry; let's have a song.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " We are so  rid of them that we must faint to follow them. [Song]\n",
      "\n",
      "  \n",
      "\n",
      "    The game's afoot; follow your spirit;\n",
      "      Show the green-sword, and thus end our forget.\n",
      " \n",
      "<|end_of_text|> [end of text]\n",
      "\n",
      "llama_print_timings:        load time =    2149.98 ms\n",
      "llama_print_timings:      sample time =     185.98 ms /   397 runs   (    0.47 ms per token,  2134.67 tokens per second)\n",
      "llama_print_timings: prompt eval time =     152.07 ms /   258 tokens (    0.59 ms per token,  1696.54 tokens per second)\n",
      "llama_print_timings:        eval time =    5514.96 ms /   396 runs   (   13.93 ms per token,    71.80 tokens per second)\n",
      "llama_print_timings:       total time =    6139.91 ms /   654 tokens\n",
      "Log end\n"
     ]
    }
   ],
   "source": [
    "# Start inference on a gguf model (-h to show all options)\n",
    "!./main -ngl 10000 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf --color --temp 1.1 --repeat_penalty 1.1 -c 0 -n 1024 -e -s 0 -p \"\"\"\\\n",
    "First Citizen:\\n\\n\\\n",
    "Before we proceed any further, hear me speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Speak, speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "You are all resolved rather to die than to famish?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Resolved. resolved.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "First, you know Caius Marcius is chief enemy to the people.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "We know't, we know't.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "Let us kill him, and we'll have corn at our own price. Is't a verdict?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "No more talking on't; let it be done: away, away!\\n\\n\\\n",
    "\\n\\n\\\n",
    "Second Citizen:\\n\\n\\\n",
    "One word, good citizens.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity, \\\n",
    "while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of \\\n",
    "our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes, \\\n",
    "ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\\n\\n\\\n",
    "\\n\\n\\\n",
    "\"\"\"\n",
    "\n",
    "# # Chat template for Termianl (Use the instruction-tuned models to better follow the template)\n",
    "# !./main -ngl 10000 -m ./models/8B-v3-instruct/ggml-model-Q4_K_M.gguf --color -c 0 -n -2 -e -s 0 --mirostat 2 -i --no-display-prompt --keep -1 \\\n",
    "# -r '<|eot_id|>' -p '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nHi!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' \\\n",
    "# --in-prefix '<|start_header_id|>user<|end_header_id|>\\n\\n' --in-suffix '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17c21608",
   "metadata": {},
   "source": [
    "## Benchmarks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b6aee37",
   "metadata": {},
   "source": [
    "### 8B Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "996ee79e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 3070, compute capability 8.6, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl | test       |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 512     |  2402.51 ± 11.82 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 1024    |   2283.62 ± 3.28 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 4096    |   1826.59 ± 1.35 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | pp 8192    |   1419.97 ± 0.73 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 512     |     72.79 ± 0.13 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 1024    |     70.94 ± 0.84 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 4096    |     67.01 ± 0.41 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 | tg 8192    |     61.64 ± 0.10 |\n",
      "\n",
      "build: 4426e298 (2824)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
